Load Packages

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(magrittr)
## 
## Attaching package: 'magrittr'
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
library(ggpubr)
library(FinancialMath)
library(ggsignif)

Loading Data & Pre-Processing

Neighborhood - Zipcode Matching Data: We’ll need this to assign each property to a neighborhood

neighborhood_data <- read_csv("data/nyc_zip_borough_neighborhoods_pop.csv")
## Rows: 177 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): borough, post_office, neighborhood
## dbl (3): zip, population, density
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
neighborhood_data
## # A tibble: 177 × 6
##      zip borough   post_office  neighborhood                  population density
##    <dbl> <chr>     <chr>        <chr>                              <dbl>   <dbl>
##  1 10001 Manhattan New York, NY Chelsea and Clinton                21102   33959
##  2 10002 Manhattan New York, NY Lower East Side                    81410   92573
##  3 10003 Manhattan New York, NY Lower East Side                    56024   97188
##  4 10004 Manhattan New York, NY Lower Manhattan                     3089    5519
##  5 10005 Manhattan New York, NY Lower Manhattan                     7135   97048
##  6 10006 Manhattan New York, NY Lower Manhattan                     3011   32796
##  7 10007 Manhattan New York, NY Lower Manhattan                     6988   42751
##  8 10009 Manhattan New York, NY Lower East Side                    61347   99492
##  9 10010 Manhattan New York, NY Gramercy Park and Murray Hill      31834   81487
## 10 10011 Manhattan New York, NY Chelsea and Clinton                50984   77436
## # … with 167 more rows

Rental Data:

rental_data <- read_csv("data/manhattan-ny_rent_2024_03_15.csv")
## New names:
## Rows: 6278 Columns: 17
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (7): listing_type, unit_description, detailed_url, unit_address, unit_a... dbl
## (10): ...1, Unnamed: 0.1, Unnamed: 0, latitude, longitude, unit_zipcode,...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
rental_data %<>% left_join(neighborhood_data, by=c("unit_zipcode"="zip"))  %>% filter(borough == "Manhattan") # attach neighborhood data
rental_data
## # A tibble: 6,274 × 22
##     ...1 Unnam…¹ Unnam…² listi…³ unit_…⁴ detai…⁵ latit…⁶ longi…⁷ unit_…⁸ unit_…⁹
##    <dbl>   <dbl>   <dbl> <chr>   <chr>   <chr>     <dbl>   <dbl> <chr>   <chr>  
##  1     0       0       0 nested  Riverb… /apart…    40.8   -74.0 560 W … 560 W …
##  2     1       1       1 nested  The Ca… /apart…    40.7   -74.0 776 6t… 776 6t…
##  3     2       2       2 nested  Waters… /apart…    40.7   -74.0 25 Wat… 25 Wat…
##  4     3       3       3 nested  Pearl … /apart…    40.7   -74.0 160 Wa… 160 Wa…
##  5     4       4       4 nested  Avalon… /apart…    40.7   -74.0 11 E 1… 11 E  …
##  6     5       5       5 nested  21 Che… /apart…    40.7   -74.0 120 W … 120 W …
##  7     6       6       6 nested  The Sa… /apart…    40.8   -74.0 189 W … 189 W …
##  8     7       7       7 nested  19 Dut… /apart…    40.7   -74.0 19 Dut… 19 Dut…
##  9     8       8       8 nested  Anagra… /apart…    40.8   -74.0 1 W 60… 1 W  6…
## 10     9       9       9 nested  The Ch… /apart…    40.7   -74.0 160 W … 160 W …
## # … with 6,264 more rows, 12 more variables: unit_city <chr>,
## #   unit_zipcode <dbl>, beds <dbl>, baths <dbl>, area <dbl>, price <dbl>,
## #   unit_number <chr>, borough <chr>, post_office <chr>, neighborhood <chr>,
## #   population <dbl>, density <dbl>, and abbreviated variable names
## #   ¹​`Unnamed: 0.1`, ²​`Unnamed: 0`, ³​listing_type, ⁴​unit_description,
## #   ⁵​detailed_url, ⁶​latitude, ⁷​longitude, ⁸​unit_address, ⁹​unit_address_street

Sales Data:

sales_data <- read_csv("data/manhattan-ny_sale_2024_03_15.csv")
## New names:
## • `` -> `...1`
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 6992 Columns: 55
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (23): rawHomeStatusCd, marketingStatusSimplifiedCd, imgSrc, detailUrl, ...
## dbl  (11): ...1, zpid, id, providerListingId, unformattedPrice, addressZipco...
## lgl  (19): hasImage, isUndisclosedAddress, isZillowOwned, isSaved, isUserCla...
## dttm  (2): openHouseStartDate, openHouseEndDate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#rename some variables
sales_data$price <- sales_data$unformattedPrice
sales_data$sqft <- sales_data$area
sales_data %<>% left_join(neighborhood_data, by=c("addressZipcode"="zip")) %>% filter(borough == "Manhattan")
sales_data
## # A tibble: 6,984 × 61
##     ...1      zpid     id rawHo…¹ marke…² provi…³ imgSrc hasIm…⁴ detai…⁵ statu…⁶
##    <dbl>     <dbl>  <dbl> <chr>   <chr>     <dbl> <chr>  <lgl>   <chr>   <chr>  
##  1    43    3.35e8 3.35e8 ForSale For Sa… 1694903 https… TRUE    https:… FOR_SA…
##  2    44    1.20e8 1.20e8 ForSale For Sa… 1681982 https… TRUE    https:… FOR_SA…
##  3    45    2.06e9 2.06e9 ForSale For Sa… 1700320 https… TRUE    https:… FOR_SA…
##  4    46    2.08e9 2.08e9 ForSale For Sa… 1698963 https… TRUE    https:… FOR_SA…
##  5    47    2.45e8 2.45e8 ForSale For Sa… 1682103 https… TRUE    https:… FOR_SA…
##  6    48    2.05e9 2.05e9 ForSale For Sa… 1690340 https… TRUE    https:… FOR_SA…
##  7    49    2.06e9 2.06e9 ForSale For Sa…      NA https… TRUE    https:… FOR_SA…
##  8    50    2.09e9 2.09e9 ForSale For Sa… 1454319 https… TRUE    https:… FOR_SA…
##  9    51    2.06e9 2.06e9 ForSale For Sa… 1672430 https… TRUE    https:… FOR_SA…
## 10    52    2.06e9 2.06e9 ForSale For Sa… 1632998 https… TRUE    https:… FOR_SA…
## # … with 6,974 more rows, 51 more variables: statusText <chr>,
## #   countryCurrency <chr>, price <dbl>, unformattedPrice <dbl>, address <chr>,
## #   addressStreet <chr>, addressCity <chr>, addressState <chr>,
## #   addressZipcode <dbl>, isUndisclosedAddress <lgl>, beds <dbl>, baths <dbl>,
## #   area <dbl>, latLong <chr>, isZillowOwned <lgl>, variableData <chr>,
## #   hdpData <chr>, isSaved <lgl>, isUserClaimingOwner <lgl>,
## #   isUserConfirmedClaim <lgl>, pgapt <chr>, sgapt <chr>, …

Annotated Sales Data: (this is data where which we have more than 2 historical sales) 1583/6992 = 0.22, so we have about 22% of the sales properties actively listed

sales_data_annotated <- read_csv("data/manhattan-ny_sale_2024_03_15_with_cagr_and_hoa_fees.csv")
## New names:
## Rows: 1583 Columns: 65
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (25): rawHomeStatusCd, marketingStatusSimplifiedCd, imgSrc, detailUrl, ... dbl
## (20): ...1, Unnamed: 0, zpid, id, providerListingId, unformattedPrice, ... lgl
## (18): hasImage, isUndisclosedAddress, isZillowOwned, isSaved, isUserCla... dttm
## (2): openHouseStartDate, openHouseEndDate
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
#rename some variables
sales_data_annotated$price <- sales_data_annotated$unformattedPrice
sales_data_annotated$sqft <- sales_data_annotated$area
sales_data_annotated %<>% left_join(neighborhood_data, by=c("addressZipcode"="zip"))  %>% filter(borough == "Manhattan")
sales_data_annotated
## # A tibble: 1,583 × 71
##     ...1 Unnamed:…¹   zpid     id rawHo…² marke…³ provi…⁴ imgSrc hasIm…⁵ detai…⁶
##    <dbl>      <dbl>  <dbl>  <dbl> <chr>   <chr>     <dbl> <chr>  <lgl>   <chr>  
##  1     3         44 1.20e8 1.20e8 ForSale For Sa… 1681982 https… TRUE    https:…
##  2    21         62 8.20e7 8.20e7 ForSale For Sa… 1682235 https… TRUE    https:…
##  3    38         79 9.46e7 9.46e7 ForSale For Sa… 1672547 https… TRUE    https:…
##  4    55         96 8.19e7 8.19e7 ForSale For Sa… 1701687 https… TRUE    https:…
##  5    56         97 1.26e8 1.26e8 ForSale For Sa… 1701077 https… TRUE    https:…
##  6    64        105 2.20e8 2.20e8 ForSale For Sa… 1696777 https… TRUE    https:…
##  7    75        116 1.20e8 1.20e8 ForSale For Sa…      NA https… TRUE    https:…
##  8    84        125 7.25e7 7.25e7 ForSale For Sa… 1676462 https… TRUE    https:…
##  9    91        132 7.25e7 7.25e7 ForSale For Sa… 1689159 https… TRUE    https:…
## 10    92        133 7.25e7 7.25e7 ForSale For Sa… 1695364 https… TRUE    https:…
## # … with 1,573 more rows, 61 more variables: statusType <chr>,
## #   statusText <chr>, countryCurrency <chr>, price <dbl>,
## #   unformattedPrice <dbl>, address <chr>, addressStreet <chr>,
## #   addressCity <chr>, addressState <chr>, addressZipcode <dbl>,
## #   isUndisclosedAddress <lgl>, beds <dbl>, baths <dbl>, area <dbl>,
## #   latLong <chr>, isZillowOwned <lgl>, variableData <chr>, hdpData <chr>,
## #   isSaved <lgl>, isUserClaimingOwner <lgl>, isUserConfirmedClaim <lgl>, …

Add financing information (assuming a 20% downpayment on a 30 year mortgage)

interest_rate = 0.0772
downpayment_percent = 0.2
air12 = interest_rate / 12 #annual interest rate/12
installments = 12 * 30# assuming a 30-year fixed 

sales_data_annotated %<>% mutate(downpayment = unformattedPrice*downpayment_percent) %>% mutate(monthly_payment = (unformattedPrice-downpayment)*(air12*(1+air12)^installments)/((1+air12)^installments - 1))

sales_data_annotated %<>% mutate(monthly_payment_w_fees = monthly_payment + hoa_fee)
sales_data_annotated
## # A tibble: 1,583 × 74
##     ...1 Unnamed:…¹   zpid     id rawHo…² marke…³ provi…⁴ imgSrc hasIm…⁵ detai…⁶
##    <dbl>      <dbl>  <dbl>  <dbl> <chr>   <chr>     <dbl> <chr>  <lgl>   <chr>  
##  1     3         44 1.20e8 1.20e8 ForSale For Sa… 1681982 https… TRUE    https:…
##  2    21         62 8.20e7 8.20e7 ForSale For Sa… 1682235 https… TRUE    https:…
##  3    38         79 9.46e7 9.46e7 ForSale For Sa… 1672547 https… TRUE    https:…
##  4    55         96 8.19e7 8.19e7 ForSale For Sa… 1701687 https… TRUE    https:…
##  5    56         97 1.26e8 1.26e8 ForSale For Sa… 1701077 https… TRUE    https:…
##  6    64        105 2.20e8 2.20e8 ForSale For Sa… 1696777 https… TRUE    https:…
##  7    75        116 1.20e8 1.20e8 ForSale For Sa…      NA https… TRUE    https:…
##  8    84        125 7.25e7 7.25e7 ForSale For Sa… 1676462 https… TRUE    https:…
##  9    91        132 7.25e7 7.25e7 ForSale For Sa… 1689159 https… TRUE    https:…
## 10    92        133 7.25e7 7.25e7 ForSale For Sa… 1695364 https… TRUE    https:…
## # … with 1,573 more rows, 64 more variables: statusType <chr>,
## #   statusText <chr>, countryCurrency <chr>, price <dbl>,
## #   unformattedPrice <dbl>, address <chr>, addressStreet <chr>,
## #   addressCity <chr>, addressState <chr>, addressZipcode <dbl>,
## #   isUndisclosedAddress <lgl>, beds <dbl>, baths <dbl>, area <dbl>,
## #   latLong <chr>, isZillowOwned <lgl>, variableData <chr>, hdpData <chr>,
## #   isSaved <lgl>, isUserClaimingOwner <lgl>, isUserConfirmedClaim <lgl>, …

Add CONDO vs CO-OP If you look up online, “D4-ELEVATOR APT”, “C6-WALK-UP APARTMENT” buildings are CO-OP buildings while R0-CONDOMINIUMS are CONDO buildings. There are some other weird types in rare cases but they are very few and we want to ignore them for this analysis.

sales_data_annotated %<>% filter(building_type %in% c("D4-ELEVATOR APT", "C6-WALK-UP APARTMENT", "R0-CONDOMINIUMS")) %>% mutate(building_type_simple = case_when(
  building_type %in% c("D4-ELEVATOR APT", "C6-WALK-UP APARTMENT ") ~ "CO-OP",
  building_type %in% c("R0-CONDOMINIUMS") ~ "CONDO"
))

sales_data_annotated %<>% filter(building_type_simple %in% c("CO-OP", "CONDO"))
sales_data_annotated # 1583 - 1368  = 215 (listings removed)
## # A tibble: 1,368 × 75
##     ...1 Unnamed:…¹   zpid     id rawHo…² marke…³ provi…⁴ imgSrc hasIm…⁵ detai…⁶
##    <dbl>      <dbl>  <dbl>  <dbl> <chr>   <chr>     <dbl> <chr>  <lgl>   <chr>  
##  1     3         44 1.20e8 1.20e8 ForSale For Sa… 1681982 https… TRUE    https:…
##  2    21         62 8.20e7 8.20e7 ForSale For Sa… 1682235 https… TRUE    https:…
##  3    38         79 9.46e7 9.46e7 ForSale For Sa… 1672547 https… TRUE    https:…
##  4    55         96 8.19e7 8.19e7 ForSale For Sa… 1701687 https… TRUE    https:…
##  5    64        105 2.20e8 2.20e8 ForSale For Sa… 1696777 https… TRUE    https:…
##  6    84        125 7.25e7 7.25e7 ForSale For Sa… 1676462 https… TRUE    https:…
##  7    91        132 7.25e7 7.25e7 ForSale For Sa… 1689159 https… TRUE    https:…
##  8    92        133 7.25e7 7.25e7 ForSale For Sa… 1695364 https… TRUE    https:…
##  9    95        136 1.22e8 1.22e8 ForSale For Sa… 1628931 https… TRUE    https:…
## 10   110        151 7.25e7 7.25e7 ForSale For Sa… 1697649 https… TRUE    https:…
## # … with 1,358 more rows, 65 more variables: statusType <chr>,
## #   statusText <chr>, countryCurrency <chr>, price <dbl>,
## #   unformattedPrice <dbl>, address <chr>, addressStreet <chr>,
## #   addressCity <chr>, addressState <chr>, addressZipcode <dbl>,
## #   isUndisclosedAddress <lgl>, beds <dbl>, baths <dbl>, area <dbl>,
## #   latLong <chr>, isZillowOwned <lgl>, variableData <chr>, hdpData <chr>,
## #   isSaved <lgl>, isUserClaimingOwner <lgl>, isUserConfirmedClaim <lgl>, …

Analysis

Data Exploration

Modeling Sales / Rental Price

I’m not really interested in this, so this won’t go very deep, but I thought it would be an interesting to give it a shoot and see what comes out…

How beds, bathrooms, sqft, condo vs. co-op and neighborhood affect sales price

lm_fit <- lm(formula = price ~ beds + baths + neighborhood + sqft + building_type_simple, data = sales_data_annotated)
summary(lm_fit)
## 
## Call:
## lm(formula = price ~ beds + baths + neighborhood + sqft + building_type_simple, 
##     data = sales_data_annotated)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -13260409   -966781     68449    739387  31161632 
## 
## Coefficients:
##                                             Estimate Std. Error t value
## (Intercept)                               -2505149.1   461381.9  -5.430
## beds                                       -265270.9   118452.5  -2.239
## baths                                      2056050.5   133410.2  15.411
## neighborhoodChelsea and Clinton            1064340.1   454226.4   2.343
## neighborhoodEast Harlem                     986360.2   736402.2   1.339
## neighborhoodGramercy Park and Murray Hill   720320.9   453687.4   1.588
## neighborhoodGreenwich Village and Soho     1453343.3   516649.0   2.813
## neighborhoodInwood and Washington Heights   899976.3   609754.8   1.476
## neighborhoodLower East Side                1053756.9   520900.9   2.023
## neighborhoodLower Manhattan                 428866.3   490020.9   0.875
## neighborhoodUpper East Side                 279365.2   459893.7   0.607
## neighborhoodUpper West Side                1174989.4   469028.4   2.505
## neighborhoodWest Side                       190457.7   651111.0   0.293
## sqft                                          1191.5      110.1  10.823
## building_type_simpleCONDO                  -684848.3   173475.0  -3.948
##                                           Pr(>|t|)    
## (Intercept)                               6.69e-08 ***
## beds                                       0.02529 *  
## baths                                      < 2e-16 ***
## neighborhoodChelsea and Clinton            0.01926 *  
## neighborhoodEast Harlem                    0.18066    
## neighborhoodGramercy Park and Murray Hill  0.11259    
## neighborhoodGreenwich Village and Soho     0.00498 ** 
## neighborhoodInwood and Washington Heights  0.14019    
## neighborhoodLower East Side                0.04328 *  
## neighborhoodLower Manhattan                0.38162    
## neighborhoodUpper East Side                0.54365    
## neighborhoodUpper West Side                0.01236 *  
## neighborhoodWest Side                      0.76994    
## sqft                                       < 2e-16 ***
## building_type_simpleCONDO                 8.29e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2562000 on 1344 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.5991, Adjusted R-squared:  0.5949 
## F-statistic: 143.4 on 14 and 1344 DF,  p-value: < 2.2e-16

0.59 R^2….its something, but its not great! Interesting baths is more significant than beds (which I’m certain are correlated, but baths probably conveys more information)

Now lets look at rental data

rental_data$sqft <- rental_data$area
lm_fit <- lm(formula = price ~ beds + baths + neighborhood + sqft, data = rental_data)
summary(lm_fit)
## 
## Call:
## lm(formula = price ~ beds + baths + neighborhood + sqft, data = rental_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33194   -998    -63    862  57439 
## 
## Coefficients:
##                                             Estimate Std. Error t value
## (Intercept)                               -4934.1482   553.0199  -8.922
## beds                                       -893.3074   121.2913  -7.365
## baths                                      1496.1275   190.7674   7.843
## neighborhoodChelsea and Clinton            4142.8502   565.2244   7.330
## neighborhoodEast Harlem                    1280.7189   729.3401   1.756
## neighborhoodGramercy Park and Murray Hill  3510.6025   561.4830   6.252
## neighborhoodGreenwich Village and Soho     7476.6158   668.1046  11.191
## neighborhoodInwood and Washington Heights   347.2018   757.4597   0.458
## neighborhoodLower East Side                3796.5844   644.1673   5.894
## neighborhoodLower Manhattan                3187.5096   572.4817   5.568
## neighborhoodTribeca                        4779.4332  2684.6282   1.780
## neighborhoodUpper East Side                3900.3133   607.8712   6.416
## neighborhoodUpper West Side                4170.0659   562.3841   7.415
## neighborhoodWest Side                      4096.6281   736.3199   5.564
## sqft                                          7.4615     0.2012  37.086
##                                           Pr(>|t|)    
## (Intercept)                                < 2e-16 ***
## beds                                      2.51e-13 ***
## baths                                     6.88e-15 ***
## neighborhoodChelsea and Clinton           3.25e-13 ***
## neighborhoodEast Harlem                     0.0792 .  
## neighborhoodGramercy Park and Murray Hill 4.86e-10 ***
## neighborhoodGreenwich Village and Soho     < 2e-16 ***
## neighborhoodInwood and Washington Heights   0.6467    
## neighborhoodLower East Side               4.37e-09 ***
## neighborhoodLower Manhattan               2.90e-08 ***
## neighborhoodTribeca                         0.0752 .  
## neighborhoodUpper East Side               1.71e-10 ***
## neighborhoodUpper West Side               1.74e-13 ***
## neighborhoodWest Side                     2.97e-08 ***
## sqft                                       < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3720 on 2156 degrees of freedom
##   (4103 observations deleted due to missingness)
## Multiple R-squared:  0.7691, Adjusted R-squared:  0.7676 
## F-statistic: 512.9 on 14 and 2156 DF,  p-value: < 2.2e-16

Wow, 0.76 R^2, that’s not bad!

We actually have in rental data the “zestimate” which is Zillow’s predictions, lets see how correlated this compares against that

predicted_values <- predict(lm_fit, newdata = sales_data)
sales_data$predicted_rent <- predicted_values
sales_data %>% select(zestimate, predicted_rent) %>% drop_na() %>% cor()
##                zestimate predicted_rent
## zestimate      1.0000000      0.5564566
## predicted_rent 0.5564566      1.0000000

0.55, so not that great! Zillow’s estimate is pretty different. I saw some negative values in there too, which is a clear sign of not being in the same range and bad extrapolation!

Listing Distribution

# Tally neighborhoods
neighborhood_counts_rentals <- data.frame(table(rental_data$neighborhood))
neighborhood_counts_sales <- data.frame(table(sales_data$neighborhood))

Is the subset of data generally representative of the total sales data (is it skewed towards some neighborhoods over others)?

neighborhood_counts_sales_annotated <- data.frame(table(sales_data_annotated$neighborhood))
neighborhood_counts_sales_annotated %>% arrange(desc(Freq))
##                             Var1 Freq
## 1  Gramercy Park and Murray Hill  312
## 2            Chelsea and Clinton  261
## 3                Upper East Side  245
## 4                Upper West Side  168
## 5                Lower Manhattan  110
## 6     Greenwich Village and Soho   78
## 7                Lower East Side   73
## 8                 Central Harlem   37
## 9  Inwood and Washington Heights   37
## 10                     West Side   28
## 11                   East Harlem   19

Compare the ratio of frequency for each neighborhood, in a perfectly representative subset, the ratio would be exactly the same for each. Here they’re generally between 4-6. I would say the general order is relatively the same and the ratio is pretty consistent, so I think its generally well sampled in terms of neighborhoods.

neighborhood_counts_sales %>% left_join(neighborhood_counts_sales_annotated, by = c("Var1"="Var1")) %>% mutate(ratio = `Freq.x`/`Freq.y`)
##                             Var1 Freq.x Freq.y    ratio
## 1                 Central Harlem    347     37 9.378378
## 2            Chelsea and Clinton   1127    261 4.318008
## 3                    East Harlem    122     19 6.421053
## 4  Gramercy Park and Murray Hill   1344    312 4.307692
## 5     Greenwich Village and Soho    463     78 5.935897
## 6  Inwood and Washington Heights    253     37 6.837838
## 7                Lower East Side    449     73 6.150685
## 8                Lower Manhattan    443    110 4.027273
## 9                        Tribeca     23     NA       NA
## 10               Upper East Side   1479    245 6.036735
## 11               Upper West Side    881    168 5.244048
## 12                     West Side     53     28 1.892857

Compare ratio of rental to sales properties by neighborhood

neighborhood_counts_rentals$num_rental_properties <- neighborhood_counts_rentals$Freq
neighborhood_counts_rentals$Freq <- NULL

neighborhood_counts_sales$num_sales_properties <- neighborhood_counts_sales$Freq
neighborhood_counts_sales$Freq <- NULL

neighborhood_counts <- neighborhood_counts_rentals %>% left_join(neighborhood_counts_sales, by = c("Var1"="Var1"))
neighborhood_counts %<>% mutate(ratio = num_rental_properties/num_sales_properties)
neighborhood_counts
##                             Var1 num_rental_properties num_sales_properties
## 1                 Central Harlem                   229                  347
## 2            Chelsea and Clinton                  1337                 1127
## 3                    East Harlem                   232                  122
## 4  Gramercy Park and Murray Hill                   929                 1344
## 5     Greenwich Village and Soho                   342                  463
## 6  Inwood and Washington Heights                   256                  253
## 7                Lower East Side                   628                  449
## 8                Lower Manhattan                   559                  443
## 9                        Tribeca                    10                   23
## 10               Upper East Side                   637                 1479
## 11               Upper West Side                  1045                  881
## 12                     West Side                    70                   53
##        ratio
## 1  0.6599424
## 2  1.1863354
## 3  1.9016393
## 4  0.6912202
## 5  0.7386609
## 6  1.0118577
## 7  1.3986637
## 8  1.2618510
## 9  0.4347826
## 10 0.4306964
## 11 1.1861521
## 12 1.3207547

Renting vs. Buying: Monthly Cost

Assuming that the buyer is taking out an 80% mortgage, what is the monthly cost of owning an apartment vs. renting one in Manhattan?

All Neighborhoods

Interestingly rent is higher for studios, then otherwise owning is more expensive.

sales_data_annotated %>% mutate(type = "sale") %>% select(neighborhood, beds, baths, sqft, monthly_payment, type) %>% full_join(rental_data %>% mutate(monthly_payment = price, type = "rental") %>% filter(!is.na(monthly_payment)) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type)) %>% ggplot(aes(x = type, y = monthly_payment, color = type)) + geom_boxplot(outlier.shape =NA) + geom_jitter(alpha=0.2, size=0.3) + theme_bw() + xlab("Type") + ylab("Monthly Payment ($)") + theme(legend.position = "right", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Monthly Payment in Rentals vs. Sales") + scale_y_log10() + facet_wrap(~beds) +
  geom_signif(comparisons = list(c("rental", "sale")), 
              map_signif_level = TRUE, 
              textsize = 3, 
              tip_length = 0.02, 
              step_increase = 0.1,
              y_position = 3)

By Neighborhood

Although interesting overall, neighborhood also plays apart, so we should look at that:

Studios
sales_data_annotated %>% mutate(type = "sale") %>% filter(beds == 0) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type) %>% full_join(rental_data %>% filter(beds == 0) %>% mutate(monthly_payment = price, type = "rental") %>% filter(!is.na(monthly_payment)) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type)) %>% ggplot(aes(x = type, y = monthly_payment, color = type)) + geom_boxplot(outlier.shape =NA) + geom_jitter(alpha=0.2, size=0.3) + theme_bw() + xlab("Type") + ylab("Monthly Payment ($)") + theme(legend.position = "right", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Monthly Payment in Rentals vs. Sales for Studio Apartments") + scale_y_log10() + facet_wrap(~neighborhood) +
  geom_signif(comparisons = list(c("rental", "sale")), 
              map_signif_level = TRUE, 
              textsize = 3, 
              tip_length = 0.02, 
              step_increase = 0.1,
              y_position = 4)
## Joining, by = c("neighborhood", "beds", "baths", "sqft", "monthly_payment",
## "type")

sales_data_annotated %>% mutate(type = case_when(
  building_type_simple == "CONDO" ~ "sale_condo",
  building_type_simple == "CO-OP" ~ "sale_coop"
)) %>% filter(beds == 0) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type) %>% full_join(rental_data %>% filter(beds == 0) %>% mutate(monthly_payment = price, type = "rental") %>% filter(!is.na(monthly_payment)) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type)) %>% ggplot(aes(x = type, y = monthly_payment, color = type)) + geom_boxplot(outlier.shape =NA) + geom_jitter(alpha=0.2, size=0.3) + theme_bw() + xlab("Type") + ylab("Monthly Payment ($)") + theme(legend.position = "right", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Monthly Payment in Rentals vs. Sales for Studio Apartments") + scale_y_log10() + facet_wrap(~neighborhood) +
  geom_signif(comparisons = list(c("rental", "sale_condo"), c("rental", "sale_coop")), 
              map_signif_level = TRUE, 
              textsize = 3, 
              tip_length = 0.02, 
              step_increase = 0.1,
              y_position = 3)
## Joining, by = c("neighborhood", "beds", "baths", "sqft", "monthly_payment",
## "type")

##### 1 Bedroom

sales_data_annotated %>% mutate(type = "sale") %>% filter(beds == 1) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type) %>% full_join(rental_data %>% filter(beds == 1) %>% mutate(monthly_payment = price, type = "rental") %>% filter(!is.na(monthly_payment)) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type)) %>% ggplot(aes(x = type, y = monthly_payment, color = type)) + geom_boxplot(outlier.shape =NA) + geom_jitter(alpha=0.2, size=0.3) + theme_bw() + xlab("Type") + ylab("Monthly Payment ($)") + theme(legend.position = "right", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Monthly Payment in Rentals vs. Sales for 1 Bedroom Apartments") + scale_y_log10() + facet_wrap(~neighborhood) +
  geom_signif(comparisons = list(c("rental", "sale")), 
              map_signif_level = TRUE, 
              textsize = 3, 
              tip_length = 0.02, 
              step_increase = 0.1,
              y_position = 4)
## Joining, by = c("neighborhood", "beds", "baths", "sqft", "monthly_payment",
## "type")

sales_data_annotated %>% mutate(type = case_when(
  building_type_simple == "CONDO" ~ "sale_condo",
  building_type_simple == "CO-OP" ~ "sale_coop"
)) %>% filter(beds == 1) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type) %>% full_join(rental_data %>% filter(beds == 1) %>% mutate(monthly_payment = price, type = "rental") %>% filter(!is.na(monthly_payment)) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type)) %>% ggplot(aes(x = type, y = monthly_payment, color = type)) + geom_boxplot(outlier.shape =NA) + geom_jitter(alpha=0.2, size=0.3) + theme_bw() + xlab("Type") + ylab("Monthly Payment ($)") + theme(legend.position = "right", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Monthly Payment in Rentals vs. Sales for 1 Bedroom Apartments") + scale_y_log10() + facet_wrap(~neighborhood) +
  geom_signif(comparisons = list(c("rental", "sale_condo"), c("rental", "sale_coop")), 
              map_signif_level = TRUE, 
              textsize = 3, 
              tip_length = 0.02, 
              step_increase = 0.1,
              y_position = 3)
## Joining, by = c("neighborhood", "beds", "baths", "sqft", "monthly_payment",
## "type")

##### 2 Bedroom

sales_data_annotated %>% mutate(type = "sale") %>% filter(beds == 2) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type) %>% full_join(rental_data %>% filter(beds == 2) %>% mutate(monthly_payment = price, type = "rental") %>% filter(!is.na(monthly_payment)) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type)) %>% ggplot(aes(x = type, y = monthly_payment, color = type)) + geom_boxplot(outlier.shape =NA) + geom_jitter(alpha=0.2, size=0.3) + theme_bw() + xlab("Type") + ylab("Monthly Payment ($)") + theme(legend.position = "right", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Monthly Payment in Rentals vs. Sales for 2 Bedroom Apartments") + scale_y_log10() + facet_wrap(~neighborhood) +
  geom_signif(comparisons = list(c("rental", "sale")), 
              map_signif_level = TRUE, 
              textsize = 3, 
              tip_length = 0.02, 
              step_increase = 0.1,
              y_position = 4)
## Joining, by = c("neighborhood", "beds", "baths", "sqft", "monthly_payment",
## "type")

sales_data_annotated %>% mutate(type = case_when(
  building_type_simple == "CONDO" ~ "sale_condo",
  building_type_simple == "CO-OP" ~ "sale_coop"
)) %>% filter(beds == 2) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type) %>% full_join(rental_data %>% filter(beds == 2) %>% mutate(monthly_payment = price, type = "rental") %>% filter(!is.na(monthly_payment)) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type)) %>% ggplot(aes(x = type, y = monthly_payment, color = type)) + geom_boxplot(outlier.shape =NA) + geom_jitter(alpha=0.2, size=0.3) + theme_bw() + xlab("Type") + ylab("Monthly Payment ($)") + theme(legend.position = "right", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Monthly Payment in Rentals vs. Sales for 2 Bedroom Apartments") + scale_y_log10() + facet_wrap(~neighborhood) +
  geom_signif(comparisons = list(c("rental", "sale_condo"), c("rental", "sale_coop")), 
              map_signif_level = TRUE, 
              textsize = 3, 
              tip_length = 0.02, 
              step_increase = 0.1,
              y_position = 3)
## Joining, by = c("neighborhood", "beds", "baths", "sqft", "monthly_payment",
## "type")

3 Bedroom
sales_data_annotated %>% mutate(type = "sale") %>% filter(beds == 3) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type) %>% full_join(rental_data %>% filter(beds == 3) %>% mutate(monthly_payment = price, type = "rental") %>% filter(!is.na(monthly_payment)) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type)) %>% ggplot(aes(x = type, y = monthly_payment, color = type)) + geom_boxplot(outlier.shape =NA) + geom_jitter(alpha=0.2, size=0.3) + theme_bw() + xlab("Type") + ylab("Monthly Payment ($)") + theme(legend.position = "right", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Monthly Payment in Rentals vs. Sales for 3 Bedroom Apartments") + scale_y_log10() + facet_wrap(~neighborhood) +
  geom_signif(comparisons = list(c("rental", "sale")), 
              map_signif_level = TRUE, 
              textsize = 3, 
              tip_length = 0.02, 
              step_increase = 0.1,
              y_position = 4)
## Joining, by = c("neighborhood", "beds", "baths", "sqft", "monthly_payment",
## "type")

sales_data_annotated %>% mutate(type = case_when(
  building_type_simple == "CONDO" ~ "sale_condo",
  building_type_simple == "CO-OP" ~ "sale_coop"
)) %>% filter(beds == 3) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type) %>% full_join(rental_data %>% filter(beds == 3) %>% mutate(monthly_payment = price, type = "rental") %>% filter(!is.na(monthly_payment)) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type)) %>% ggplot(aes(x = type, y = monthly_payment, color = type)) + geom_boxplot(outlier.shape =NA) + geom_jitter(alpha=0.2, size=0.3) + theme_bw() + xlab("Type") + ylab("Monthly Payment ($)") + theme(legend.position = "right", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Monthly Payment in Rentals vs. Sales for 3 Bedroom Apartments") + scale_y_log10() + facet_wrap(~neighborhood) +
  geom_signif(comparisons = list(c("rental", "sale_condo"), c("rental", "sale_coop")), 
              map_signif_level = TRUE, 
              textsize = 3, 
              tip_length = 0.02, 
              step_increase = 0.1,
              y_position = 3)
## Joining, by = c("neighborhood", "beds", "baths", "sqft", "monthly_payment",
## "type")

##### 4 Bedroom

sales_data_annotated %>% mutate(type = "sale") %>% filter(beds == 4) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type) %>% full_join(rental_data %>% filter(beds == 4) %>% mutate(monthly_payment = price, type = "rental") %>% filter(!is.na(monthly_payment)) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type)) %>% ggplot(aes(x = type, y = monthly_payment, color = type)) + geom_boxplot(outlier.shape =NA) + geom_jitter(alpha=0.2, size=0.3) + theme_bw() + xlab("Type") + ylab("Monthly Payment ($)") + theme(legend.position = "right", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Monthly Payment in Rentals vs. Sales for 4 Bedroom Apartments") + scale_y_log10() + facet_wrap(~neighborhood) +
  geom_signif(comparisons = list(c("rental", "sale")), 
              map_signif_level = TRUE, 
              textsize = 3, 
              tip_length = 0.02, 
              step_increase = 0.1,
              y_position = 4)
## Joining, by = c("neighborhood", "beds", "baths", "sqft", "monthly_payment",
## "type")

sales_data_annotated %>% mutate(type = case_when(
  building_type_simple == "CONDO" ~ "sale_condo",
  building_type_simple == "CO-OP" ~ "sale_coop"
)) %>% filter(beds == 4) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type) %>% full_join(rental_data %>% filter(beds == 4) %>% mutate(monthly_payment = price, type = "rental") %>% filter(!is.na(monthly_payment)) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type)) %>% ggplot(aes(x = type, y = monthly_payment, color = type)) + geom_boxplot(outlier.shape =NA) + geom_jitter(alpha=0.2, size=0.3) + theme_bw() + xlab("Type") + ylab("Monthly Payment ($)") + theme(legend.position = "right", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Monthly Payment in Rentals vs. Sales for 4 Bedroom Apartments") + scale_y_log10() + facet_wrap(~neighborhood) +
  geom_signif(comparisons = list(c("rental", "sale_condo"), c("rental", "sale_coop")), 
              map_signif_level = TRUE, 
              textsize = 3, 
              tip_length = 0.02, 
              step_increase = 0.1,
              y_position = 3)
## Joining, by = c("neighborhood", "beds", "baths", "sqft", "monthly_payment",
## "type")

Generally, it seems that as bedrooms increase, the difference in monthly payment between rentals and sales increase, there also seem to be some strong neighborhood effects. Interestingly, it seems like there isn’t a significant difference in overall monthly cost for large swaths of apartments, particularly in the 0 and 1 bedroom ranges.

What % More Expensive is Buying vs. Renting?

test_data <- sales_data_annotated %>% mutate(type = "sale") %>% filter(beds == 0) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type) %>% full_join(rental_data %>% filter(beds == 0) %>% mutate(monthly_payment = price, type = "rental") %>% filter(!is.na(monthly_payment)) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type)) 
## Joining, by = c("neighborhood", "beds", "baths", "sqft", "monthly_payment",
## "type")
test_data %>% filter(sqft >0 ) %>% mutate(monthly_price_sqft = monthly_payment/sqft) %>% filter(type == "sale") %>% pull(monthly_price_sqft) %>% median() / test_data %>% filter(sqft >0 ) %>% mutate(monthly_price_sqft = monthly_payment/sqft) %>% filter(type == "rental") %>% pull(monthly_price_sqft) %>% median()
## [1] 1.042845

The average monthly cost of a studio in Manhattan is only 5% higher than a Rental (4% median)

test_data <- sales_data_annotated %>% mutate(type = "sale") %>% filter(beds == 1) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type) %>% full_join(rental_data %>% filter(beds == 1) %>% mutate(monthly_payment = price, type = "rental") %>% filter(!is.na(monthly_payment)) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type)) 
## Joining, by = c("neighborhood", "beds", "baths", "sqft", "monthly_payment",
## "type")
test_data %>% filter(sqft >0 ) %>% mutate(monthly_price_sqft = monthly_payment/sqft) %>% filter(type == "sale") %>% pull(monthly_price_sqft) %>% median() / test_data %>% filter(sqft >0 ) %>% mutate(monthly_price_sqft = monthly_payment/sqft) %>% filter(type == "rental") %>% pull(monthly_price_sqft) %>% median()
## [1] 1.08446

10% higher for 1 bedrooms (mean), (8% median)

test_data <- sales_data_annotated %>% mutate(type = "sale") %>% filter(beds == 2) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type) %>% full_join(rental_data %>% filter(beds == 2) %>% mutate(monthly_payment = price, type = "rental") %>% filter(!is.na(monthly_payment)) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type)) 
## Joining, by = c("neighborhood", "beds", "baths", "sqft", "monthly_payment",
## "type")
test_data %>% filter(sqft >0 ) %>% mutate(monthly_price_sqft = monthly_payment/sqft) %>% filter(type == "sale") %>% pull(monthly_price_sqft) %>% median() / test_data %>% filter(sqft >0 ) %>% mutate(monthly_price_sqft = monthly_payment/sqft) %>% filter(type == "rental") %>% pull(monthly_price_sqft) %>% median()
## [1] 1.288779

29% higher for 2 bedrooms (29% median)

test_data <- sales_data_annotated %>% mutate(type = "sale") %>% filter(beds == 3) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type) %>% full_join(rental_data %>% filter(beds == 3) %>% mutate(monthly_payment = price, type = "rental") %>% filter(!is.na(monthly_payment)) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type)) 
## Joining, by = c("neighborhood", "beds", "baths", "sqft", "monthly_payment",
## "type")
test_data %>% filter(sqft >0 ) %>% mutate(monthly_price_sqft = monthly_payment/sqft) %>% filter(type == "sale") %>% pull(monthly_price_sqft) %>% median() / test_data %>% filter(sqft >0 ) %>% mutate(monthly_price_sqft = monthly_payment/sqft) %>% filter(type == "rental") %>% pull(monthly_price_sqft) %>% median()
## [1] 1.46875

57% for 3 bedrooms (47% median)

test_data <- sales_data_annotated %>% mutate(type = "sale") %>% filter(beds == 4) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type) %>% full_join(rental_data %>% filter(beds == 4) %>% mutate(monthly_payment = price, type = "rental") %>% filter(!is.na(monthly_payment)) %>% select(neighborhood, beds, baths, sqft, monthly_payment, type)) 
## Joining, by = c("neighborhood", "beds", "baths", "sqft", "monthly_payment",
## "type")
test_data %>% filter(sqft >0 ) %>% mutate(monthly_price_sqft = monthly_payment/sqft) %>% filter(type == "sale") %>% pull(monthly_price_sqft) %>% median() / test_data %>% filter(sqft >0 ) %>% mutate(monthly_price_sqft = monthly_payment/sqft) %>% filter(type == "rental") %>% pull(monthly_price_sqft) %>% median()
## [1] 1.659079

111% for 4 bedrooms (66% median)

Buying Condos vs. Co-Ops

Are There More Condos or Co-Op Listings?

There seems to be almost double the amount of condos to co-ops listed in this subset

building_type_counts <- data.frame(table(sales_data_annotated$building_type_simple))
building_type_counts
##    Var1 Freq
## 1 CO-OP  487
## 2 CONDO  881

Comparing HOA Fees

Wherever there is data comparing CO-OPs and CONDOs, CO-OPs have much higher HOA fees.

# Define a custom y position for the p-value labels
custom_label_y <- exp(mean(log(sales_data_annotated$hoa_fee), na.rm = TRUE))

# Assuming `sales_data_annotated` is your data frame
sales_data_annotated %>%
  filter(beds < 6) %>%
  ggplot(aes(x = neighborhood, y = hoa_fee, color = building_type_simple)) + 
  geom_boxplot(outlier.shape = NA) + 
  theme_bw() + 
  xlab("Neighborhood") + 
  ylab("HOA Fees") + 
  theme(legend.position = "right", 
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) + 
  ggtitle("HOA Fees by # Beds and Neighborhood") + 
  scale_y_log10() + 
  facet_wrap(~beds) 

Comparing Price/Sqft

How does the price per squarefoot compare between Co-Ops and Condos?

sales_data_annotated %>% filter(sqft > 0) %>% mutate(monthly_payment_per_sqft = price/sqft) %>% ggplot(aes(x = building_type_simple, y = monthly_payment_per_sqft, color = building_type_simple)) + 
  geom_boxplot(outlier.shape = NA) + 
  geom_jitter(alpha = 0.2) + 
  theme_bw() + 
  xlab("Neighborhood") + 
  ylab("Price per Sqft") + 
  theme(legend.position = "right", 
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) + 
  ggtitle("Price/Sqft of Co-Op vs. Condo") +
  stat_compare_means(method = "t.test", label = "p.signif", comparisons = list(c("CO-OP", "CONDO")))
## [1] FALSE

price_per_sqft_data <- sales_data_annotated %>%
  filter(sqft > 0) %>%
  mutate(monthly_payment_per_sqft = price / sqft)

# Calculate the mean price per square foot for each building type
mean_price_per_sqft <- price_per_sqft_data %>%
  group_by(building_type_simple) %>%
  summarise(mean_price_per_sqft = mean(monthly_payment_per_sqft, na.rm = TRUE))

mean_price_per_sqft
## # A tibble: 2 × 2
##   building_type_simple mean_price_per_sqft
##   <chr>                              <dbl>
## 1 CO-OP                              1109.
## 2 CONDO                              1711.

1711.269/1109.424 = 1.54 (about 50% more expensive per sqft on average)

sales_data_annotated %>% filter(building_type_simple == "CONDO", neighborhood == "Upper East Side", beds == 1) %>% pull(historical_cagr) %>% median()
## [1] 0.04784556

Investment Analysis

How Do Historic Returns (Broadly) Compare vs. S&P 500?

When I looked it up, I saw the CAGR for the S&P500 is 9.27%, so I’m going to use that as a benchmark to compare against the CAGR for these listings

sales_data_annotated %>% ggplot(aes(x = neighborhood, y = historical_cagr, color = building_type_simple)) + geom_boxplot(outlier.shape =NA) + theme_bw() + xlab("Neighborhood") + ylab("Historical CAGR") + theme(legend.position = "right", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("All Units") + scale_y_log10() + geom_hline(yintercept = 0.0927, linetype = "dashed", color = "red")

The baseline historical CAGR for these apartments seems to be below the S&P 500 (but you can actually multiply it using a mortgage, which you can’t really do with others)

Breakdown by Bedroom & Neighborhood

Facet By Beds (1-4)

sales_data_annotated %>% filter(beds < 6) %>% ggplot(aes(x = neighborhood, y = historical_cagr, color = building_type_simple)) + geom_boxplot(outlier.shape =NA)+ theme_bw() + xlab("Neighborhood") + ylab("Historical CAGR") + theme(legend.position = "right", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Historic Returns by # Beds and Neighborhood") + scale_y_log10() + geom_hline(yintercept = 0.0927, linetype = "dashed", color = "red") + facet_wrap(~beds)

Identifying Ideal Property Types

Ideally, we want to identify a neighborhood x apartment type that has high historic returns with lower volatility, and many data points so that we have higher confidence in the data

While 4 bedroom condos in Lower Manhattan have the best overall profile (9% median returns with .2% SD, there are only 2 points so its not reliable)

# n = 25 is arbitrary 
sales_data_annotated %>%
  group_by(beds, building_type_simple, neighborhood) %>% filter(n() > 25) %>%
  summarise(
    median_cagr = median(historical_cagr, na.rm = TRUE),
    std_dev = sd(historical_cagr, na.rm = TRUE),
    count = n(),
    ratio = median(historical_cagr, na.rm = TRUE)/sd(historical_cagr, na.rm = TRUE)
  ) %>% arrange(desc(ratio)) 
## `summarise()` has grouped output by 'beds', 'building_type_simple'. You can
## override using the `.groups` argument.
## # A tibble: 17 × 7
## # Groups:   beds, building_type_simple [6]
##     beds building_type_simple neighborhood          media…¹ std_dev count  ratio
##    <dbl> <chr>                <chr>                   <dbl>   <dbl> <int>  <dbl>
##  1     1 CONDO                Upper East Side        0.0478  0.0383    39 1.25  
##  2     1 CONDO                Upper West Side        0.0581  0.111     38 0.524 
##  3     0 CO-OP                Gramercy Park and Mu…  0.0301  0.0657    31 0.458 
##  4     1 CONDO                Chelsea and Clinton    0.0423  0.109     84 0.388 
##  5     2 CONDO                Chelsea and Clinton    0.0391  0.136     73 0.287 
##  6     1 CONDO                Gramercy Park and Mu…  0.0519  0.191     50 0.271 
##  7     2 CONDO                Upper West Side        0.0576  0.220     30 0.261 
##  8     1 CO-OP                Gramercy Park and Mu…  0.0217  0.0848    72 0.255 
##  9     2 CO-OP                Upper East Side        0.0428  0.186     46 0.230 
## 10     2 CONDO                Gramercy Park and Mu…  0.0521  0.311     57 0.168 
## 11     1 CO-OP                Upper East Side        0.0317  0.192     47 0.165 
## 12     1 CO-OP                Chelsea and Clinton    0.0483  0.298     26 0.162 
## 13     2 CONDO                Upper East Side        0.0436  0.272     43 0.160 
## 14     1 CONDO                Lower Manhattan        0.0237  0.195     42 0.122 
## 15     2 CONDO                Lower Manhattan        0.0244  0.211     33 0.116 
## 16     3 CONDO                Gramercy Park and Mu…  0.0194  0.232     26 0.0834
## 17     2 CO-OP                Gramercy Park and Mu…  0.0412  1.76      38 0.0234
## # … with abbreviated variable name ¹​median_cagr

1 Bedroom Condos in the upper east side seem to be the best, with ~5% median CAGR with 4% standard deviation and 39 data points.

Income / Capital Requirements

At the lowest end, you’d need an annual income of at least 137151.5 and $99,0000, but you could own a very good apartment. To afford median monthly payment, you’d want $218k annual salary

Minimum:

listing_data <- sales_data_annotated

# Use the 40X rule to see what minimum income you'd need to afford
listing_data %>% filter(neighborhood == "Upper East Side", building_type_simple=="CONDO", beds == 1) %>% pull(monthly_payment) %>% min() * 40 # $113,151.5
## [1] 113151.5
listing_data %>% filter(neighborhood == "Upper East Side", building_type_simple=="CONDO", beds == 1) %>% pull(downpayment) %>% min() # $99,000
## [1] 99000

Median:

# Use the 40X rule to see what minimum income you'd need to afford
listing_data %>% filter(neighborhood == "Upper East Side", building_type_simple=="CONDO", beds == 1) %>% pull(monthly_payment) %>% median() * 40 # $218,302
## [1] 218302.4
listing_data %>% filter(neighborhood == "Upper East Side", building_type_simple=="CONDO", beds == 1) %>% pull(downpayment) %>% median()  # $191,000
## [1] 191000

5 Year Projections

We want to see what the equity in the property would be after 5 years, and also what the appreciation on it would be like. To compute this, we use the historical CAGR of the property to project the future value and then calculate the equity built after several years by building an amoritization table for each property.

listing_data$remaining_balance <- NULL
listing_data$equity <- NULL

n_years <- 5 # years 
n_years_in_months <- n_years*12
listing_data$loan_amount <- listing_data$downpayment / downpayment_percent

# Parameters
annual_interest_rate <- 0.0772  # 2% annual interest
compounding_frequency <- 12   # Monthly compounding
payment_frequency <- 12       # Monthly payments
loan_term_years <- 30


balance_at_target_year <- c()
for (i in seq(length(listing_data$`Unnamed: 0`))) {

  # Calculate the amortization table for each property
  amortization_table <- amort.table(
    Loan = listing_data[i, ] %>% pull(loan_amount),
    n = loan_term_years * payment_frequency,  # Total number of payments
    i = annual_interest_rate,
    ic = compounding_frequency,
    pf = payment_frequency,
    plot = FALSE  # Set TRUE if you want to plot payment proportions
  )
  
  # extract the data
  data.frame(amortization_table$Schedule)[n_years_in_months, ] %>% pull(`Principal.Paid`)
  
  balance_at_target_year <- c(balance_at_target_year, data.frame(amortization_table$Schedule)[n_years_in_months, ] %>% pull(`Balance`))
  
    
}

options(scipen = 999)

listing_data$balance_at_target_year <- balance_at_target_year
listing_data$equity_at_target_year <- listing_data$loan_amount - listing_data$balance_at_target_year + listing_data$downpayment
listing_data$balance_paid_off_at_target_year <- listing_data$loan_amount - listing_data$balance_at_target_year 

# this is purely appreciation on the initial down payment (not considering equity)
listing_data$est_roi_on_downpayment <- (listing_data$historical_cagr*n_years*listing_data$unformattedPrice)/listing_data$downpayment
listing_data$est_roi_vs_benchmark <- (listing_data$est_roi_on_downpayment - 0.0927*n_years)/n_years

listing_data
## # A tibble: 1,368 × 81
##     ...1 Unnamed:…¹   zpid     id rawHo…² marke…³ provi…⁴ imgSrc hasIm…⁵ detai…⁶
##    <dbl>      <dbl>  <dbl>  <dbl> <chr>   <chr>     <dbl> <chr>  <lgl>   <chr>  
##  1     3         44 1.20e8 1.20e8 ForSale For Sa… 1681982 https… TRUE    https:…
##  2    21         62 8.20e7 8.20e7 ForSale For Sa… 1682235 https… TRUE    https:…
##  3    38         79 9.46e7 9.46e7 ForSale For Sa… 1672547 https… TRUE    https:…
##  4    55         96 8.19e7 8.19e7 ForSale For Sa… 1701687 https… TRUE    https:…
##  5    64        105 2.20e8 2.20e8 ForSale For Sa… 1696777 https… TRUE    https:…
##  6    84        125 7.25e7 7.25e7 ForSale For Sa… 1676462 https… TRUE    https:…
##  7    91        132 7.25e7 7.25e7 ForSale For Sa… 1689159 https… TRUE    https:…
##  8    92        133 7.25e7 7.25e7 ForSale For Sa… 1695364 https… TRUE    https:…
##  9    95        136 1.22e8 1.22e8 ForSale For Sa… 1628931 https… TRUE    https:…
## 10   110        151 7.25e7 7.25e7 ForSale For Sa… 1697649 https… TRUE    https:…
## # … with 1,358 more rows, 71 more variables: statusType <chr>,
## #   statusText <chr>, countryCurrency <chr>, price <dbl>,
## #   unformattedPrice <dbl>, address <chr>, addressStreet <chr>,
## #   addressCity <chr>, addressState <chr>, addressZipcode <dbl>,
## #   isUndisclosedAddress <lgl>, beds <dbl>, baths <dbl>, area <dbl>,
## #   latLong <chr>, isZillowOwned <lgl>, variableData <chr>, hdpData <chr>,
## #   isSaved <lgl>, isUserClaimingOwner <lgl>, isUserConfirmedClaim <lgl>, …
Comparing Excess Returns

We look at all the listings and their historical CAGR to see how well they would perform against the S&P 500, generally most (if financed) would yield a greater ROI on the down payment, but have high volatility

# Calculate the IQR and determine bounds for outlier removal
listing_data_2 <- listing_data %>% filter(beds < 4) %>%
  group_by(neighborhood, building_type_simple) %>%
  mutate(Q1 = quantile(est_roi_vs_benchmark, 0.25),
         Q3 = quantile(est_roi_vs_benchmark, 0.75)) %>%
  mutate(IQR = Q3 - Q1,
         Lower_Bound = Q1 - 1.5 * IQR,
         Upper_Bound = Q3 + 1.5 * IQR) %>%
  filter(est_roi_vs_benchmark >= Lower_Bound & est_roi_vs_benchmark <= Upper_Bound) %>%
  ungroup()

# only look at neighborhoods with data > 50 (higher certainty)
listing_data_2 %<>% group_by(neighborhood) %>% filter(n() > 50)

# Plotting the filtered data without outliers
ggplot(listing_data_2, aes(x = neighborhood, y = est_roi_vs_benchmark, color = building_type_simple)) +
  geom_boxplot() +  # Now the boxplot will naturally have no outliers as they were filtered out
  geom_jitter(alpha = 0.4) +  # Adds a jitter plot to display all data points
  theme_bw() +  # Applies a minimal theme
  xlab("Neighborhood") +
  ylab("Excess CAGR vs. Benchmark") +
  theme(legend.position = "right", 
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  ggtitle("All Units") + geom_hline(yintercept = 0, linetype = "dashed", color = "red") + facet_wrap(~beds) +
  geom_signif(comparisons = list(c("CO-OP", "CONDO")), 
              map_signif_level = TRUE, 
              textsize = 3, 
              tip_length = 0.02, 
              step_increase = 0.1,
              y_position = 0)

Sharpe Ratio for our “Ideal” Category

bed_num <- 1

portfolio_return <- listing_data %>% filter(beds == bed_num) %>% filter(neighborhood == "Upper East Side") %>% pull(historical_cagr) %>% mean()
risk_free_rate <- 0.035    # current rate for treasury bonds (3.5%) 
standard_deviation <- listing_data %>% filter(beds == bed_num) %>% filter(neighborhood == "Upper East Side") %>% pull(historical_cagr) %>% sd()  # 12%

# Calculate excess return
excess_return <- portfolio_return - risk_free_rate

# Calculate Sharpe Ratio (alpha)
sharpe_ratio <- excess_return / standard_deviation

sharpe_ratio
## [1] 0.1497269

What % of Listings Have Higher Historic Returns than S&P 500 (With a Mortgage)

listing_data %>% filter(est_roi_vs_benchmark > 0) %>% pull(est_roi_vs_benchmark) %>% length()
## [1] 950
listing_data %>% filter(est_roi_vs_benchmark < 0) %>% pull(est_roi_vs_benchmark) %>% length()
## [1] 418

After 5 years, 418 did not beat benchmark, 950 did (69% of total)

Breakdown by Apt Type

Studio: 71.6% 1 Bedroom: 70.0% 2 Bedroom: 72.4% 3 Bedroom: 61.3% 4 Bedroom: 59.2%

bed_num = 4
listing_data %>% filter(beds == bed_num) %>% filter(est_roi_vs_benchmark > 0) %>% pull(est_roi_vs_benchmark) %>% length() / (listing_data %>% filter(beds == bed_num) %>% pull(est_roi_vs_benchmark)) %>% length()
## [1] 0.5918367

Are HOA Fees Associated with Returns?

# Try compare hoa fee with cagr directly (no significant relationship)
lm_fit <- lm(formula = historical_cagr ~ hoa_fee, data = listing_data)
summary(lm_fit)
## 
## Call:
## lm(formula = historical_cagr ~ hoa_fee, data = listing_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0850 -0.0730 -0.0458 -0.0094 30.9711 
## 
## Coefficients:
##                 Estimate   Std. Error t value Pr(>|t|)  
## (Intercept)  0.087925088  0.035617787   2.469   0.0137 *
## hoa_fee     -0.000001456  0.000011366  -0.128   0.8981  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9111 on 1360 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  1.206e-05,  Adjusted R-squared:  -0.0007232 
## F-statistic: 0.0164 on 1 and 1360 DF,  p-value: 0.8981
# Try to normalize by taking hoa fee per sqft (no significant linear relationship)
lm_fit <- lm(formula = historical_cagr ~ hoa_fee_per_sqft, data = listing_data %>% filter(sqft > 0) %>% mutate(hoa_fee_per_sqft = hoa_fee/sqft))
summary(lm_fit)
## 
## Call:
## lm(formula = historical_cagr ~ hoa_fee_per_sqft, data = listing_data %>% 
##     filter(sqft > 0) %>% mutate(hoa_fee_per_sqft = hoa_fee/sqft))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.04916 -0.04221 -0.01268  0.02683  1.91377 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)   
## (Intercept)      0.044416   0.016636   2.670  0.00773 **
## hoa_fee_per_sqft 0.005559   0.009378   0.593  0.55351   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2146 on 891 degrees of freedom
##   (4 observations deleted due to missingness)
## Multiple R-squared:  0.0003942,  Adjusted R-squared:  -0.0007277 
## F-statistic: 0.3513 on 1 and 891 DF,  p-value: 0.5535

Plot to visually inspect

listing_data %>% ggplot(aes(x = hoa_fee, y = historical_cagr)) + geom_point() + theme_bw() + scale_y_log10() + scale_x_log10() + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'